1 Load Libraries

#' Attach a package, installing it from CRAN first if it is missing.
#'
#' The argument is captured unevaluated, so both `packages(ggplot2)` and
#' `packages("ggplot2")` work.
#'
#' @param x A single package name, bare or quoted.
#' @return `TRUE`, invisibly, once the package is attached. An error is
#'   raised if the package cannot be installed or attached.
packages <- function(x) {
  pkg <- as.character(substitute(x))
  if (length(pkg) != 1L || !nzchar(pkg)) {
    stop("`x` must be a single package name.", call. = FALSE)
  }

  if (!requireNamespace(pkg, quietly = TRUE)) {
    install.packages(pkgs = pkg, repos = "https://cran.r-project.org")
    # install.packages() only warns on failure, so verify the result here
    # instead of letting a later call fail with a confusing message.
    if (!requireNamespace(pkg, quietly = TRUE)) {
      stop("Package '", pkg, "' could not be installed.", call. = FALSE)
    }
  }

  # library() errors when attaching fails; require() would just return FALSE.
  library(pkg, character.only = TRUE)
  invisible(TRUE)
}

# Attach every package the analysis below relies on, installing any that are
# missing. The helper captures its argument unevaluated, so each package gets
# its own call; the order is kept because attach order decides which package
# masks which.
packages(ggplot2)
packages(googleVis)
packages(reshape2)
packages(plotly)
packages(RColorBrewer)
packages(readr)
packages(rpart)
packages(dplyr)
packages(rpart.plot)

# Set the googleVis plot tag and keep the previous option values in `op` so
# they can be restored with options(op).
# NOTE(review): presumably this makes plot() on gvis objects emit only the
# chart HTML (for embedding in a report) -- confirm against the googleVis docs.
op <- options(gvis.plot.tag='chart')

2 Load Sample Loan Data

# Work from the demo directory; the relative file paths used throughout this
# script are resolved against it.
setwd("~/LDCDemo")

# Read the loan table.
loan <- read_csv(file = "loan_df_reclean_new.csv")

# Preview the first rows.
head(loan, n = 6L)

3 Exploratory data analysis

3.1 Target Variable: Loan Status

# Frequency of each loan status, with display-friendly column names.
plot_status <- as.data.frame(table(loan$loan_status))
names(plot_status) <- c("Loan Status", "Freq")

# The same counts as a column chart and as a pie chart.
status_bar <- gvisColumnChart(
  data = plot_status,
  options = list(legend = "none", width = 400, height = 200)
)
status_pie <- gvisPieChart(
  data = plot_status,
  options = list(width = 400, height = 200)
)

# Lay the two charts out side by side and save the combined HTML.
merged_plot <- gvisMerge(
  x = status_bar,
  y = status_pie,
  horizontal = TRUE,
  tableOptions = "cellspacing=\"20\" bgcolor=\"#AABBCC\""
)
cat(merged_plot[["html"]][["chart"]], file = "merged_plot.html")

# Show the underlying counts.
plot_status

3.2 Total Loss Due to Bad Loans

# Total loan amount per outcome label. Missing amounts are skipped, matching
# the per-state totals computed elsewhere in this script.
total_loss <- loan %>%
  select(loan_amnt, label) %>%
  group_by(label) %>%
  summarise(total_loan_amnt = sum(loan_amnt, na.rm = TRUE))

# Derive the display name and bar colour from the label itself (1 = good,
# otherwise bad) instead of relying on the row order and on there being
# exactly two groups.
is_good <- total_loss$label == 1
total_loss$Loan_Status <- ifelse(is_good, "Good", "Bad")
total_loss$count.style <- ifelse(is_good, "darkblue", "red")

# Horizontal bar chart; the `count.style` column carries the per-bar colour.
plot_loss <- gvisBarChart(
  total_loss,
  xvar = "Loan_Status",
  yvar = c("total_loan_amnt", "count.style"),
  options = list(
    title = "Total Loss Due to Bad Loan",
    height = 400,
    width = 800,
    legend = "none"
  )
)

plot(plot_loss)

3.3 Grade vs. Interest: The interest rate increases from grade A to grade G.

# Box plot of interest rate, one box (and colour) per loan grade.
p_grade_interest <- loan %>%
  plot_ly(y = ~int_rate, color = ~grade, type = "box")

p_grade_interest

3.4 Grade vs. Loan Status

# Number of loans for every grade / label combination. The result is
# ungrouped so later steps do not inherit the `grade` grouping.
plot_grade <- loan %>%
  select(grade, label) %>%
  group_by(grade, label) %>%
  summarise(total = n()) %>%
  ungroup()

# One row per grade, one column per label value ("0" and "1"). The value
# column is named explicitly so dcast() does not have to guess it.
reshaped <- dcast(plot_grade, grade ~ label, value.var = "total")

# Share of each label within a grade, stacked to 100%.
SteppedArea <- gvisSteppedAreaChart(
  reshaped,
  xvar = "grade",
  yvar = c("0", "1"),
  options = list(isStacked = "percent")
)

plot(SteppedArea)

3.5 Loan Amount by State: Most of the money goes to California.

# Total loan amount per state, ignoring missing amounts.
value_by_state <- loan %>%
  group_by(addr_state) %>%
  summarise(value = sum(loan_amnt, na.rm = TRUE))

# Ten states with the largest totals, shown as a small table.
tbl <- value_by_state[order(value_by_state$value, decreasing = TRUE), ]
tbl <- head(tbl, n = 10)
names(tbl)[2] <- "Total Loan Amt"
Tbl <- gvisTable(data = tbl, options = list(height = 300, width = 200))

# US map shaded by the per-state total.
GeoStates <- gvisGeoChart(
  data = value_by_state,
  locationvar = "addr_state",
  colorvar = "value",
  options = list(
    region = "US",
    displayMode = "regions",
    resolution = "provinces",
    width = 600,
    height = 400
  )
)

# Map and table side by side.
plot(gvisMerge(x = GeoStates, y = Tbl, horizontal = TRUE))

3.6 Default Rate by Loan Purpose

# Loan count and average loan amount for each loan purpose.
plot_purpose <- loan %>%
  select(loan_amnt, label, purpose) %>%
  group_by(purpose) %>%
  summarise(total = n(), avg_loan = mean(loan_amnt))

# Bubble chart: x = number of loans, y = average amount, bubble size = number
# of loans, one bubble per purpose. Axis settings are passed as JSON strings.
Bubble <- gvisBubbleChart(
  data = plot_purpose,
  idvar = "purpose",
  xvar = "total",
  yvar = "avg_loan",
  colorvar = "purpose",
  sizevar = "total",
  options = list(
    title = "Major Loan Purpose",
    hAxis = "{ title: 'Count',
                                     titleTextStyle: {color: 'black'},
                                     viewWindowMode:'explicit',
                                     viewWindow: {max: 225000}}",
    vAxis = "{ title: 'Average Loan Amount',
                                     titleTextStyle: {
                                     color: 'black'}}",
    legend = "none",
    colorAxis = "{colors: ['red', 'blue']}",
    height = 400,
    width = 800
  )
)

# Save the chart HTML on its own.
cat(Bubble[["html"]][["chart"]], file = "Bubble.html")

# Per purpose: number of loans and the mean of `label` (the share of loans
# labelled 1, i.e. fully paid).
df_rate <- loan %>%
  select(label, purpose) %>%
  group_by(purpose) %>%
  summarise(total_n = n(), full_payment_rate = mean(label))

# Default rate is the complement of the rounded full-payment rate; sort the
# purposes from lowest to highest default rate.
df_rate$default_rate <- 1 - round(df_rate$full_payment_rate, 2)
df_rate <- df_rate[order(df_rate$default_rate), ]

# control bar color: one colour per row on a green-to-red ramp. The ramp is
# sized from the data so it works for any number of purposes, not only 14.
colfunc <- colorRampPalette(c("darkgreen", "red"))
df_rate$count.style <- colfunc(nrow(df_rate))

Bar <- gvisBarChart(
  df_rate,
  xvar = "purpose",
  yvar = c("default_rate", "count.style"),
  options = list(
    title = "Default Rate by Loan Purpose",
    height = 400,
    width = 800,
    legend = "none"
  )
)
# plot(Bar) cat(Bar$html$chart, file = 'Bar.html')

# Stack the bubble chart above the bar chart.
plot(gvisMerge(Bubble, Bar, horizontal = FALSE))

3.7 DTI

# Human-readable outcome: label 1 is "Good", anything else is "Bad".
loan <- mutate(loan, label_fct = ifelse(label == 1, "Good", "Bad"))

# Keep only loans whose debt-to-income ratio is below 100.
loan_sub <- loan %>% filter(dti < 100)

# Box plot of DTI per outcome, with outlier points not drawn.
dti_box <- ggplot(loan_sub, aes(x = label_fct, y = dti, fill = label_fct)) +
  geom_boxplot(outlier.shape = NA) +
  ggtitle("Bad Loans Have Higher Debt-to-Income Ratio")

# Convert to a plotly object and make the markers of every trace fully
# transparent.
dti_box <- plotly_build(dti_box)
dti_box$x$data <- lapply(dti_box$x$data, function(trace) {
  trace$marker <- list(opacity = 0)
  trace
})

dti_box

3.8 Home Ownership vs. Loan Status

# Number of loans for every home-ownership / outcome combination. The result
# is ungrouped so later steps do not inherit the grouping.
plot_home <- loan %>%
  select(home_ownership, label_fct) %>%
  group_by(home_ownership, label_fct) %>%
  summarise(countn = n()) %>%
  ungroup()

# One row per outcome, one column per home-ownership category. The value
# column is named explicitly so dcast() does not have to guess it.
reshaped_home <- dcast(
  plot_home,
  label_fct ~ home_ownership,
  value.var = "countn"
)

# Plot every ownership category present in the data instead of assuming
# there are exactly four of them in columns 2 to 5.
ownership_cols <- setdiff(names(reshaped_home), "label_fct")

SteppedArea_home <- gvisSteppedAreaChart(
  reshaped_home,
  xvar = "label_fct",
  yvar = ownership_cols,
  options = list(
    isStacked = "percent",
    title = "Home Ownership vs. Loan Status"
  )
)

plot(SteppedArea_home)

3.9 Decision Tree

# Replace `loan` with the data set used for the tree.
loan <- read_csv("testData_prob.csv")

# Random sample of up to 10000 rows. The size is capped at the number of rows
# available because sample_n() errors when asked for more rows than exist.
# NOTE(review): `loan_sample` is not used by the fit below, which trains on
# the full `loan` table -- confirm whether the sample was meant to be the
# training data.
loan_sample <- sample_n(loan, min(10000, nrow(loan)))

# Classification tree predicting `label` from home ownership, grade and DTI.
fit <- rpart(
  label ~ home_ownership + grade + dti,
  data = loan,
  method = "class",
  control = rpart.control(minsplit = 100, minbucket = 4, cp = .001)
)

# Draw the fitted tree.
rpart.plot(fit, tweak = 1)